1 Guided analysis

1.1 Initial data handling

The first step in this analysis would be to import all libraries that are going to be used during these tasks:

library(tidyverse)
library(ggplot2)
library(plotly)
library(countrycode)
library(kableExtra)

Then reading in the gapminder_clean.csv data as a tibble using read_csv and renaming columns to avoid the utilization of extensive names:

data <- read.csv("gapminder_clean.csv") %>%
  as_tibble() %>%
  rename(co2_emissions=CO2.emissions..metric.tons.per.capita.) %>% 
  rename(population_density=Population.density..people.per.sq..km.of.land.area.) %>%
  rename(imports=Imports.of.goods.and.services....of.GDP.) %>% 
  rename(energy_use=Energy.use..kg.of.oil.equivalent.per.capita.) %>%
  rename(life_expectancy=Life.expectancy.at.birth..total..years.)

1.2 CO2 and GDP per capita

First off, it’s requested that I plot a graph of countries’ CO2 emissions matched with their GDP per capita in 1962. To achieve that, we must filter the data so we get just the data on the relevant year:

data_on_62 <- data %>%
  filter(Year == 1962)

With this data, we can already plot the respective dot plot:

ggplot(data_on_62, aes(x = co2_emissions, y = gdpPercap)) +
  geom_point() +
  labs(
    x = "CO2 emissions (metric tons per capita)", y = "GDP per capita",
    title = "GDP per capita variation according to CO2 emissions"
  )

And also determine the correlation between these variables:

cor_test_res <- cor.test(data_on_62$co2_emissions, data_on_62$gdpPercap)
cor_test_res
## 
##  Pearson's product-moment correlation
## 
## data:  data_on_62$co2_emissions and data_on_62$gdpPercap
## t = 25.269, df = 106, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8934697 0.9489792
## sample estimates:
##       cor 
## 0.9260817

It is then requested that I calculate in which year the correlation between the CO2 emissions and GDP per capita is stronger. It can be done in the following manner:

cor_table <- data %>%
  select(Country.Name, Year, gdpPercap, co2_emissions) %>%
  drop_na() %>%
  group_by(Year) %>%
  summarize(cor = cor(co2_emissions, gdpPercap))

cor_table <- cbind(Rank = seq(1, 10), cor_table[order(-cor_table$cor), ])

cor_table %>%
  kbl(caption = "Correlation between the CO2 emissions and GDP per capita in its respective years") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Correlation between the CO2 emissions and GDP per capita in its respective years
Rank Year cor
1 1967 0.9387918
2 1962 0.9260817
3 1972 0.8428986
4 1982 0.8166384
5 1987 0.8095531
6 1992 0.8094316
7 1997 0.8081396
8 2002 0.8006421
9 1977 0.7928336
10 2007 0.7204169

And finally, we will plot the same dot plot as before, but coloring the dots according to their continents and sizing them according to their population:

co2_gdp_scatterplot <- data_on_62 %>%
  select(Country.Name, Year, co2_emissions, continent, pop, gdpPercap) %>%
  drop_na() %>%
  ggplot(aes(
    x = co2_emissions,
    y = gdpPercap,
    color = continent,
    size = pop
  )) +
  geom_point(alpha = 0.5) +
  labs(
    x = "CO2 emissions (metric tons per capita)", y = "GDP per capita",
    title = "GDP per capita variation according to CO2 emissions",
  ) +
  scale_color_discrete(name = "Continent") +
  scale_size("", range = c(1, 10))
ggplotly(co2_gdp_scatterplot)

2 Unguided analysis

2.1 Energy use problem

What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?

As a relation with a categorical predictor variable and a single quantitative outcome to be evaluated in respect to multiple groups, the adequate statistical test to be implemented in this case is ANOVA. And aplying this method, we get the following results:

energy_continent_anova <- aov(energy_use ~ continent, data = data)
summary(energy_continent_anova)
##               Df    Sum Sq   Mean Sq F value Pr(>F)    
## continent      5 8.124e+08 162482656   21.88 <2e-16 ***
## Residuals   1404 1.043e+10   7426183                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1197 observations deleted due to missingness

2.2 Imports problem

Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?

In contrast, although this case also features a categorical predictor variable and single quantitative outcome, they are grouped according to just two continents. So, in this case we should analyse it with a t-test:

relevant_continents <- c("Europe", "Asia")

data_as_eu_after_90 <- data %>%
  select(Country.Name, Year, imports, continent) %>%
  filter(continent %in% relevant_continents, Year > 1990)

europe_asia_imports_t_test <- t.test(data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"], data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"])
europe_asia_imports_t_test
## 
##  Welch Two Sample t-test
## 
## data:  data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Europe"] and data_as_eu_after_90$imports[data_as_eu_after_90$continent == "Asia"]
## t = -1.3552, df = 137.53, p-value = 0.1776
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -12.433240   2.321099
## sample estimates:
## mean of x mean of y 
##  41.78924  46.84531

2.3 Population density problem

What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years?

We can visualize the question in this problem by plotting a line graph showing the population density evolution across the years:

pop_density_plot <- ggplot(data, aes(x = Year, y = log10(population_density), group = Country.Name, color = Country.Name, label = Country.Name)) +
  geom_line() +
  labs(
    y = "Population density (log10(people/square km of land area))", x = "Year",
    title = "Population density variation according to year per country",
  )
ggplotly(pop_density_plot)

To ascertain which country has the highest average ranking, we can compare them according to a score that consists of the sum of their position in a decreasing ranking of population density spanning all available years. It’s calculated in the following way:

years <- unique(data$Year)
countries <- unique(data$Country.Name[!is.na(data$Country.Name)])

pop_density_ranking <- rep(0, times = length(countries))
names(pop_density_ranking) <- countries

for (x in years) {
  year_data <- data %>%
    select(Country.Name, Year, population_density) %>%
    na.omit() %>%
    filter(Year == x)
  year_data$population_density <- rank(year_data$population_density, na.last = TRUE)
  for (z in year_data$Country.Name) {
    pop_density_ranking[[z]] <- pop_density_ranking[[z]] + year_data$population_density[year_data$Country.Name == z]
  }
}

So the countries with the highest scores are:

pop_density_ranking <- pop_density_ranking %>%
  sort(decreasing = TRUE)

data.frame(Rank = seq(1, 10), Country = names(head(pop_density_ranking, 10)), Score = unname(head(pop_density_ranking, 10))) %>%
  kbl(caption = "Countries with the most population density during the 1962-2007 period") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Countries with the most population density during the 1962-2007 period
Rank Country Score
1 Macao SAR, China 2553
2 Monaco 2553
3 Hong Kong SAR, China 2537
4 Singapore 2529
5 Gibraltar 2518
6 Bermuda 2506
7 Malta 2498
8 Bangladesh 2476
9 Channel Islands 2474
10 Maldives 2461

Having already calculated the population density score, we can best visualize it by building a choropleth:

country_codes <- countrycode(names(pop_density_ranking), origin = "country.name", destination = "iso3c")
pop_dens_df <- data.frame(country = names(pop_density_ranking), rank = unname(pop_density_ranking), code = country_codes)

geo_config <- list(
  scope = "world",
  showocean = TRUE,
  oceancolor = toRGB("LightBlue"),
  showland = TRUE,
  landcolor = toRGB("#e5ecf6")
)

density_choropleth <- plot_ly(pop_dens_df, type = "choropleth", locations = pop_dens_df$code, z = pop_dens_df$rank, text = pop_dens_df$country, colors = "Reds") %>%
  layout(title = "Population density ranking dominance in the 1962-2007 period)", geo = geo_config) %>%
  colorbar(title = "Arbitrary units")

density_choropleth

2.4 Life expectancy problem

What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?

To calculate this, we must get the difference between the life expectancy at the first data point and the last one for each country. We can do such a thing in the following manner:

life_expectancy_diff <- rep(0, times = length(countries))
names(life_expectancy_diff) <- countries

for (z in countries) {
  country_life_exp <- data %>%
    select(Country.Name, Year, life_expectancy) %>%
    filter(Country.Name == z)
  years <- unique(country_life_exp$Year)
  life_expectancy_diff[z] <- country_life_exp$life_expectancy[country_life_exp$Year == tail(years, 1)] - country_life_exp$life_expectancy[country_life_exp$Year == head(years, 1)]
}

life_expectancy_diff <- life_expectancy_diff %>%
  sort(decreasing = TRUE)

Then we can present the countries who had the greatest increases in their life expectancy in a bar plot:

life_expectancy_diff_df <- data.frame(names = factor(names(life_expectancy_diff), levels = names(life_expectancy_diff)), life_expectancy_diff)

top_life_exp_bar_plot <- life_expectancy_diff_df %>%
  head(10) %>%
  ggplot(aes(names, life_expectancy_diff, color = names, fill = names)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.position = "none") +
  labs(
    y = "Life expectancy difference (in years)", x = "Country",
    title = "Most profound gains in life expectancy during 1962-2007 period"
  )

ggplotly(top_life_exp_bar_plot)

As we can also see the worst results in this period:

rev_life_expectancy_diff <- life_expectancy_diff %>%
  sort(decreasing = FALSE)
rev_life_expectancy_diff_df <- life_expectancy_diff_df <- data.frame(names = factor(names(rev_life_expectancy_diff), levels = names(rev_life_expectancy_diff)), rev_life_expectancy_diff)

low_life_exp_bar_plot <- rev_life_expectancy_diff_df %>%
  head(10) %>%
  ggplot(aes(names, rev_life_expectancy_diff, color = names, fill = names)) +
  geom_col() +
  theme(axis.text.x = element_text(angle = 90)) +
  theme(legend.position = "none") +
  labs(
    y = "Life expectancy difference (in years)", x = "Country",
    title = "Worst changes in life expectancy during 1962-2007 period"
  )

ggplotly(low_life_exp_bar_plot)

And at last, a choropleth plot would be useful for presenting the bigger picture:

le_country_codes <- countrycode(names(life_expectancy_diff), origin = "country.name", destination = "iso3c")
life_expectancy_df <- data.frame(country = names(life_expectancy_diff), years = unname(life_expectancy_diff), code = le_country_codes)

geo_config <- list(
  scope = "world",
  showocean = TRUE,
  oceancolor = toRGB("LightBlue"),
  showland = TRUE,
  landcolor = toRGB("#e5ecf6")
)

le_choropleth <- plot_ly(life_expectancy_df, type = "choropleth", locations = life_expectancy_df$code, z = life_expectancy_df$years, text = life_expectancy_df$country, colors = "Greens") %>%
  layout(title = "Changes in life expectancy in the 1962-2007 period)", geo = geo_config) %>%
  colorbar(title = "Years")

le_choropleth